# install.packages("tidyverse")
# install.packages("timetk")
library(tidyverse)
library(readr) #read_csv()
library(dplyr)
library(glmnet)  # lasso / elastic-net fitting
library(timetk)  # time_series_cv(), plot_time_series_cv_plan()
library(rsample) # analysis() / assessment() accessors for CV splits
# Load data:
df <- read_csv("./data/walmart_features.csv")
head(df)
Data Preprocessing + Sort Values. Notes: Date is already in Date format. Here we are just converting store and holiday_flag from numeric/character to categorical type, and dropping redundant columns.
print(sapply(df, class)) # view col datatypes
# Pasted console output of the line above:
store date year month week is_q4 weekly_sales holiday_flag
"numeric" "Date" "numeric" "numeric" "numeric" "logical" "numeric" "character"
temperature fuel_price cpi unemployment lag1 lag2 lag4 lag8
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
ma4 ma8 store_mean_to_prev store_sd_to_prev sin1 cos1 sin2 cos2
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
is_holiday_num inter_holiday_lag1 inter_temp_holiday inter_cpi_unemp
"numeric" "numeric" "numeric" "numeric"
# Convert store / holiday_flag to factors, sort by (date, store),
# and drop is_holiday_num (redundant with holiday_flag).
df <- df %>%
mutate(
store = as.factor(store),
holiday_flag = factor(holiday_flag, levels = unique(holiday_flag))
) %>%
arrange(date, store) %>% # Sort by date, then store, ascending order
select(-is_holiday_num) # drop col
head(df)
# Checking the data (uncomment to see output if you wish)
print(dim(df))
[1] 6075 27
#table(df$date) # value counts per date -> 45 rows for every date
There are 45 stores, so it makes sense that there are 45 entries for each date. Also, each date represents one week.
Here we perform an 80/20 train/test split to hold back data for comparing different models’ performances. This is not a random split, as we need to preserve time ordering here. Also, it is not an 80/20 split on the row indices, since that could cause some stores sharing the same date to be split across train and test.
# Train/Test Split:
# Split on DATES (not row indices) so all 45 stores for a given week
# stay on the same side of the cutoff; cutoff date itself goes to train.
all_dates <- unique(df$date) # already sorted
cutoff_ix <- floor(0.8 * length(all_dates))
cutoff_date <- all_dates[cutoff_ix]
print(cutoff_date)
[1] "2012-04-20"
train_df <- df %>% filter(date <= cutoff_date)
test_df <- df %>% filter(date > cutoff_date)
print(dim(train_df))
[1] 4860 27
print(dim(test_df))
[1] 1215 27
cat("train:", format(min(train_df$date), "%Y-%m-%d"), " to ", format(max(train_df$date), "%Y-%m-%d"),"\n")
train: 2010-04-02 to 2012-04-20
cat("test:", format(min(test_df$date), "%Y-%m-%d"), " to ", format(max(test_df$date), "%Y-%m-%d"),"\n")
test: 2012-04-27 to 2012-10-26
print(colnames(df))
[1] "store" "date" "year" "month" "week" "is_q4" "weekly_sales"
[8] "holiday_flag" "temperature" "fuel_price" "cpi" "unemployment" "lag1" "lag2"
[15] "lag4" "lag8" "ma4" "ma8" "store_mean_to_prev" "store_sd_to_prev" "sin1"
[22] "cos1" "sin2" "cos2" "inter_holiday_lag1" "inter_temp_holiday" "inter_cpi_unemp"
It is important to keep the future in the test set; otherwise the model could cheat by using information from the future to predict the past (leakage).
# NOTE - ctrl shift c to mass uncomment/comment out
# Attempt 1 - shorter, fixed-size rolling windows (kept for reference)
# cv_folds <- time_series_cv(
# data = train_df,
# date_var = Date,
# cumulative = FALSE, # no growing window (same size folds)
# initial = "6 months", # length of train window
# assess = "3 months", # length of validation window
# skip = "3 months", # jump between folds
# slice_limit = 5 # num folds max
# )
# Attempt 2 -- longer initial window, cumulative (growing) windowing
cv_folds <- time_series_cv(
data = train_df,
date_var = date,
cumulative = TRUE, # growing window (train set expands each fold)
initial = "1 year", # length of initial train window
assess = "3 months", # length of validation window
skip = "3 months", # jump between folds
slice_limit = 5 # num folds max
)
# Visualize the train/validation windows of each CV slice:
plot_time_series_cv_plan(cv_folds, .date_var = date, .value = weekly_sales, .interactive=TRUE)
Additional setup: Define functions for standardization. Normalize weekly sales within each store, so that no one store dominates and weekly sales are comparable across stores: scaled = (sales_store - mean(sales_store)) / std(sales_store). We can convert back later via sales_store = (scaled_sales * store_std) + store_mean.
##################### Fit Standard Scaler ###########################
# Fit a per-store standard scaler.
# Returns a list with:
#   stats - one row per store, with <col>_mean / <col>_sd for each column
#   cols  - the column names the scaler was fitted on
scaler_fit <- function(df, cols_to_standardize) {
  # Compute mean and sd of every target column, grouped by store.
  per_store_stats <- df %>%
    group_by(store) %>%
    summarise(
      across(
        all_of(cols_to_standardize),
        list(
          mean = function(x) mean(x, na.rm = TRUE),
          sd = function(x) sd(x, na.rm = TRUE)
        )
      ),
      .groups = "drop"
    )
  # Bundle the fitted stats together with the column list for later use.
  list(
    stats = per_store_stats,
    cols = cols_to_standardize
  )
}
###################### Transform using scaler ############################
# Apply a fitted per-store scaler: z-score each scaled column in place.
# Returns a copy of df with the columns replaced (no extra columns added).
scaler_transform <- function(df, scaler, cols = scaler$cols) {
  # Attach the fitted per-store stats (adds <col>_mean / <col>_sd columns).
  scaled <- df %>% left_join(scaler$stats, by = "store")
  # Overwrite each column with its z-score: (x - mean) / sd.
  for (nm in cols) {
    mu <- scaled[[paste0(nm, "_mean")]]
    sigma <- scaled[[paste0(nm, "_sd")]]
    scaled[[nm]] <- (scaled[[nm]] - mu) / sigma
  }
  # Drop the helper stat columns so the output schema matches the input.
  scaled %>% select(-ends_with("_mean"), -ends_with("_sd"))
}
####################### Un-standardize a col (e.g. yhat) #############
# Invert the per-store z-score for a single column (e.g. predictions).
# df must contain `store` and the scaled column `col`.
# Returns a copy of df with `col` back on the original scale.
scaler_inverse_column <- function(df, scaler, col) {
  stats <- scaler$stats
  mean_col <- paste0(col, "_mean")
  sd_col <- paste0(col, "_sd")
  # Keep only this column's per-store stats.
  # all_of() is required here: selecting with bare character vectors
  # is deprecated tidyselect usage and raises a warning.
  stats_small <- stats %>%
    select(store, all_of(c(mean_col, sd_col)))
  df2 <- df %>% left_join(stats_small, by = "store")
  # Undo z-score: x_raw = x_scaled * sd + mean
  df2[[col]] <- df2[[col]] * df2[[sd_col]] + df2[[mean_col]]
  # Drop helper stats
  df2 %>% select(-ends_with("_mean"), -ends_with("_sd"))
}
# Columns to z-score per store: the target plus every feature derived
# from it (lags, moving averages, store-level stats, interactions).
sales_cols_to_standardize <- c(
"weekly_sales",
"lag1", "lag2", "lag4", "lag8",
"ma4", "ma8",
"store_mean_to_prev", "store_sd_to_prev",
"inter_holiday_lag1"
)
Additional setup: Define cross-validation as one re-usable function. Rolling time-series cross-validation for hyperparameter selection for lasso. Note: cv_folds contains the full training data, and design_matrix_formula contains the specification of the features we want for this problem — so it all checks out.
# Rolling time-series cross-validation for lasso lambda selection.
# Args:
#   cv_folds             - rsample/timetk CV object (list of train/valid splits)
#   lambda_grid          - candidate lambda values to evaluate
#   design_matrix_formula- formula passed to model.matrix() for the features
#   cols_to_standardize  - columns to z-score per store (scaler fitted on
#                          each fold's train portion only, to avoid leakage)
# Side effects: prints the best lambda and plots mean CV MSE vs lambda.
# Returns: the lambda with the smallest mean validation MSE.
lasso_cv <- function(cv_folds, lambda_grid, design_matrix_formula,
cols_to_standardize=sales_cols_to_standardize) {
  n_folds <- length(cv_folds$splits)
  n_lambdas <- length(lambda_grid)
  # fold_mse[i, j] = validation MSE of lambda_grid[j] on fold i
  fold_mse <- matrix(NA_real_, nrow = n_folds, ncol = n_lambdas)
  for (fold_ix in seq_len(n_folds)) {
    # Get train, valid data for current cv fold:
    fold <- cv_folds$splits[[fold_ix]]
    train_data <- analysis(fold)
    valid_data <- assessment(fold)
    # Standardize cols (scaler fitted on train fold only -> no leakage):
    scaler <- scaler_fit(train_data, cols_to_standardize)
    train_scaled <- scaler_transform(train_data, scaler)
    valid_scaled <- scaler_transform(valid_data, scaler)
    # Manipulate data into design matrices:
    X_train <- model.matrix(design_matrix_formula, data=train_scaled)
    y_train <- train_scaled$weekly_sales
    X_val <- model.matrix(design_matrix_formula, data=valid_scaled)
    y_val <- valid_scaled$weekly_sales
    # Fit the WHOLE lambda path once per fold. glmnet's warm starts make
    # this much faster and more stable than refitting for each single
    # lambda (the glmnet docs advise against supplying a single lambda).
    lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=lambda_grid)
    # (n_val x n_lambdas) prediction matrix, one column per lambda:
    yhat_val <- predict(lasso_model, newx=X_val, s=lambda_grid)
    fold_mse[fold_ix, ] <- colMeans((y_val - yhat_val)^2)
  }
  # Average over folds for each lambda, then pick the minimizer:
  cv_mses <- colMeans(fold_mse)
  best_lambda <- lambda_grid[which.min(cv_mses)]
  cat("best lambda: ", best_lambda, "with min mse ", min(cv_mses), "\n\n")
  plot(x=lambda_grid,
       y=cv_mses,
       main = paste0(n_folds, "-Fold Cross-Validation MSE for different Lambdas"),
       xlab = "Lambda",
       ylab = "Mean CV MSE")
  best_lambda
}
# Define design matrix for this test (baseline: no lag/time features).
design_matrix_formula_baseline_lasso <- weekly_sales ~ holiday_flag + temperature + fuel_price + cpi + unemployment + 0 # 0 means no intercept, glmnet will add its own;
# Define lambda search grid--an easy way to get one with good scaling is from glmnet function itself:
scaler <- scaler_fit(train_df, sales_cols_to_standardize)
train_scaled <- scaler_transform(train_df, scaler)
unused_X_all <- model.matrix(design_matrix_formula_baseline_lasso, data=train_scaled)
unused_y_all <- train_scaled$weekly_sales
unused_model <- glmnet(unused_X_all, unused_y_all, alpha=1)
lambda_grid <- unused_model$lambda
lambda_grid
[1] 0.1321184830 0.1203814411 0.1096870857 0.0999427873 0.0910641455 0.0829742578 0.0756030534 0.0688866865 0.0627669832 0.0571909374 0.0521102521
[12] 0.0474809210 0.0432628468 0.0394194947 0.0359175755 0.0327267570 0.0298194022 0.0271703286 0.0247565915 0.0225572842 0.0205533572 0.0187274536
[23] 0.0170637582 0.0155478610 0.0141666319 0.0129081073 0.0117613867 0.0107165375 0.0097645098 0.0088970577 0.0081066676 0.0073864936 0.0067302979
[34] 0.0061323968 0.0055876115 0.0050912235 0.0046389333 0.0042268232 0.0038513239 0.0035091829 0.0031974368 0.0029133853 0.0026545682 0.0024187436
[45] 0.0022038691 0.0020080835 0.0018296909 0.0016671462 0.0015190415 0.0013840940 0.0012611349 0.0011490992 0.0010470164 0.0009540023 0.0008692514
[56] 0.0007920295 0.0007216678 0.0006575568 0.0005991413 0.0005459152 0.0004974176 0.0004532284
# Select lambda by rolling time-series CV:
best_lambda <- lasso_cv(cv_folds, lambda_grid, design_matrix_formula_baseline_lasso)
best lambda: 0.007386494 with min mse 1.319609
# Fit model with best lambda on full train data, get test error:
test_scaled <- scaler_transform(test_df, scaler)
X_test <- model.matrix(design_matrix_formula_baseline_lasso, data=test_scaled)
y_test_unscaled <- test_df$weekly_sales # not scaled
X_train <- model.matrix(design_matrix_formula_baseline_lasso, data=train_scaled)
y_train <- train_scaled$weekly_sales
best_lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=best_lambda)
# Get test error:
yhat_test <- predict(best_lasso_model, newx=X_test, s=best_lambda)
# Un-scale: (need to build a small helper dataframe for this with store id)
pred_df <- test_df %>%
select(store) %>%
mutate(weekly_sales = yhat_test) # note that this does not actually modify test_df
pred_df_unscaled <- scaler_inverse_column(pred_df, scaler, "weekly_sales")
yhat_test_unscaled <- pred_df_unscaled$weekly_sales
mse_test <- mean((y_test_unscaled - yhat_test_unscaled)^2)
cat("Test MSE (unscaled): ", mse_test, "\n\n")
Test MSE (unscaled): 9717192723
wape_test <- sum(abs(y_test_unscaled - yhat_test_unscaled)) / sum(abs(y_test_unscaled))
cat("Test WAPE:", wape_test, "\n\n")
Test WAPE: 0.06438912
coef(best_lasso_model)
7 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 7.055992e-01
holiday_flagNon-Holiday -4.144757e-01
holiday_flagHoliday 4.963938e-14
temperature -5.327214e-03
fuel_price -1.905627e-02
cpi 3.301644e-04
unemployment .
# colnames(train_df)
# Define design matrix for this test (all features except store id / date).
design_matrix_formula_all_features <- weekly_sales ~ . - store - date + 0 # 0 means no intercept, glmnet will add its own;
# Define lambda search grid--an easy way to get one with good scaling is from glmnet function itself:
scaler <- scaler_fit(train_df, sales_cols_to_standardize)
train_scaled <- scaler_transform(train_df, scaler)
unused_X_all <- model.matrix(design_matrix_formula_all_features, data=train_scaled)
unused_y_all <- train_scaled$weekly_sales
unused_model <- glmnet(unused_X_all, unused_y_all, alpha=1)
lambda_grid <- unused_model$lambda
lambda_grid
[1] 0.3932540160 0.3583184131 0.3264863928 0.2974822415 0.2710547391 0.2469749831 0.2250344063 0.2050429699 0.1868275175 0.1702302758 0.1551074873
[12] 0.1413281656 0.1287729609 0.1173331260 0.1069095744 0.0974120225 0.0887582069 0.0808731724 0.0736886227 0.0671423285 0.0611775891 0.0557427407
[23] 0.0507907093 0.0462786026 0.0421673391 0.0384213089 0.0350080657 0.0318980457 0.0290643112 0.0264823179 0.0241297018 0.0219860856 0.0200329023
[34] 0.0182532345 0.0166316674 0.0151541560 0.0138079025 0.0125812465 0.0114635632 0.0104451719 0.0095172516 0.0086717652 0.0079013895 0.0071994518
[45] 0.0065598724 0.0059771114 0.0054461213 0.0049623029 0.0045214656 0.0041197910 0.0037538001 0.0034203229 0.0031164708 0.0028396121 0.0025873488
[56] 0.0023574959 0.0021480625 0.0019572345 0.0017833592 0.0016249305 0.0014805761 0.0013490458 0.0012292003 0.0011200015 0.0010205037 0.0009298450
[67] 0.0008472401 0.0007719736 0.0007033936 0.0006409061 0.0005839697 0.0005320915 0.0004848219 0.0004417517 0.0004025077 0.0003667500 0.0003341690
# Select lambda by rolling time-series CV:
best_lambda <- lasso_cv(cv_folds, lambda_grid, design_matrix_formula_all_features)
best lambda: 0.007901389 with min mse 0.6604256
# Fit model with best lambda on full train data, get test error:
test_scaled <- scaler_transform(test_df, scaler)
X_test <- model.matrix(design_matrix_formula_all_features, data=test_scaled)
y_test_unscaled <- test_df$weekly_sales # not scaled
X_train <- model.matrix(design_matrix_formula_all_features, data=train_scaled)
y_train <- train_scaled$weekly_sales
best_lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=best_lambda)
# Get test error:
yhat_test <- predict(best_lasso_model, newx=X_test, s=best_lambda)
# Un-scale: (need to build a small helper dataframe for this with store id)
pred_df <- test_df %>%
select(store) %>%
mutate(weekly_sales = yhat_test) # note that this does not actually modify test_df
pred_df_unscaled <- scaler_inverse_column(pred_df, scaler, "weekly_sales")
yhat_test_unscaled <- pred_df_unscaled$weekly_sales
mse_test <- mean((y_test_unscaled - yhat_test_unscaled)^2)
cat("Test MSE (unscaled): ", mse_test, "\n\n")
Test MSE (unscaled): 4664649702
wape_test <- sum(abs(y_test_unscaled - yhat_test_unscaled)) / sum(abs(y_test_unscaled))
cat("Test WAPE:", wape_test, "\n\n")
Test WAPE: 0.04349279
coef(best_lasso_model)
26 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) -1.888898e+02
year 9.312715e-02
month 2.009185e-01
week 8.134132e-06
is_q4FALSE -2.730022e-01
is_q4TRUE .
holiday_flagHoliday 5.465857e+00
temperature .
fuel_price 3.304306e-02
cpi .
unemployment -2.463699e-03
lag1 2.520160e-01
lag2 1.157488e-01
lag4 2.403907e-01
lag8 7.038497e-02
ma4 6.324143e-02
ma8 -5.589519e-02
store_mean_to_prev .
store_sd_to_prev -7.858224e-02
sin1 8.055230e-01
cos1 2.309093e-02
sin2 2.121380e-01
cos2 1.207982e-01
inter_holiday_lag1 -1.553990e+00
inter_temp_holiday 2.827708e-03
inter_cpi_unemp .
Sort coeffs
# Rank coefficients by absolute magnitude, keeping their signs:
coefs <- as.matrix(coef(best_lasso_model))[, 1]
coefs_sorted <- sort(abs(coefs), decreasing = TRUE)
# Print sorted coefficients with signs
coef_df <- data.frame(
feature = names(coefs_sorted),
coef = coefs[names(coefs_sorted)]
)
coef_df
First CV param attempt notes: The above serves as a baseline, when using no time-averaging or time-lagged features. It could just predict the mean, not finding a strong signal with the other features. Next, we will need to try incorporating time. In the meantime, some things I will try are changing the folds and lambdas used.
second CV param attempt notes: Second attempt, used 1 year initial, and cumulative folds. Now holidays and temperature matter, which is better than the first attempt, which had 0’s for all features and intercept was the only kept feature. MSE stayed the same.
LASSO baseline, with NO lagged variables.
Lasso path — shows variables entering and leaving the model as lambda (regularization strength) increases. (Not cross-validated.)
# Fit the full lambda path (no single best lambda, no CV) for path plots.
fit_path <- glmnet(X_train, y_train, alpha = 1)
plot(fit_path, xvar = "lambda", label = TRUE)
title("LASSO Coefficient Paths")
# Map the numeric labels in the path plot back to variable names.
# (fit_path$beta excludes the intercept row.)
beta_mat <- as.matrix(fit_path$beta)
#rownames(beta_mat)
variable_map <- data.frame(
  index = seq_len(nrow(beta_mat)),
  variable = rownames(beta_mat)
)
print(variable_map)
# 1) Extract coefficients for each tested lambda value, and convert datatype to R matrix:
Beta_lambda <- as.matrix(coef(fit_path, s = fit_path$lambda))
# 2) coef() always includes an intercept row (all zeros here, since the
#    design matrix has no intercept column).
#Beta_lambda <- Beta_lambda[-1,, drop=FALSE] # all rows except row 1, all columns; dont change dimensions
# print(Beta_lambda)
cat('Beta_lambda shape:', dim(Beta_lambda), '\n') # shape: (p x #lambdas
# 3) Get L0, L1 norms per lambda:
l1_norm <- colSums(abs(Beta_lambda)) # L1: sum of |coef| in each column
l0_norm <- colSums(Beta_lambda != 0) # L0: number of nonzero coeffs per lambda
# 4) Order columns by increasing L1 norm so the plot's x axis is
#    increasing model complexity (larger L1 norm to the right):
incr_order_ixs <- order(l1_norm) # indices giving increasing order
l1_norm_ordered <- l1_norm[incr_order_ixs]
l0_norm_ordered <- l0_norm[incr_order_ixs]
Beta_lambda_ordered <- Beta_lambda[, incr_order_ixs, drop = FALSE]
# 5) Plot coeff vs. L1 norm.
# matplot draws one line per COLUMN of y; we want a line per coefficient,
# and Beta_lambda is (p x #lambdas), so transpose it. Row count of y then
# matches length of x (#lambdas).
cat('L1 norm shape:', length(l1_norm_ordered), '\n')
cat('Beta_lambda shape:', dim(Beta_lambda_ordered), '\n')
# BUG FIX: plot the ORDERED norms/coefficients computed above
# (previously the unordered l1_norm / Beta_lambda were plotted).
matplot(x = l1_norm_ordered, y = t(Beta_lambda_ordered),
        type = "l", # plot type: line
        lwd = 2,    # line width
        lty = 1,    # line type (solid)
        xlab = expression("L1 Norm (" * "||" * hat(beta[lambda]) * "||"[1] * ")"),
        ylab = "Coefficients",
        main = "Lasso Path")
abline(h = 0, col = "black", lwd = 1, lty = 2) # reference line at zero